#! /usr/bin/env python

#
# 
# Copyright (c) 2007-2013, University of California / Singapore Management University
#   Lingxiao Jiang         <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
#   Ghassan Misherghi      <ghassanm@ucdavis.edu>
#   Zhendong Su            <su@ucdavis.edu>
#   Stephane Glondu        <steph@glondu.net>
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the University of California nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# 
#

import re
import sys

if len(sys.argv) != 2:
    print "Usage: ", sys.argv[0], " <orig clone reports>"
    sys.exit(1)

#required_lines= int(sys.argv[2])
el= re.compile('.*:(\d+) NODE_KIND.*')
def vector_linecount( vector ):
    m= el.match(vector)
    if m == None:
        return -1
    return int( m.group(1) )

#NOTE: filenames can contain spaces...
range_r= re.compile('.*FILE (.*) LINE:(\d+):(\d+) NODE_KIND.*')
def vector_linerange( vector ):
    m= range_r.match(vector)
    if m == None:
        return (-1,-1,-1)
    return ( int(m.group(2)), int(m.group(2))+int(m.group(3))-1, m.group(1) )


vc= re.compile('.*nVARs:(\d+) NUM_NODE.*')
def vector_varcount( vector ):
    m= vc.match(vector)
    if m == None:
        return -1
    return int( m.group(1) )

    
def has_enough_lines( cluster ):
    for vector in cluster:
        lines= vector_linecount( vector )
        if lines >= required_lines:
            return True
    return False

def line_diff_big( cluster ):
    maxl= reduce( lambda x,y: max(x,y), map( vector_linecount, cluster ))
    minl= reduce( lambda x,y: min(x,y), map( vector_linecount, cluster ))
    if maxl>minl+5:
        return False
    return True

def count_duplicates( l, i ):
    n= 0
    for item in l:
        if i==item:
            n+=1
    return n

def only_one_diff( cluster ):
    counts= map( vector_varcount, cluster )
    counts.sort()
    if counts[-1] - counts[0] > 2:
        return False
    front= count_duplicates(counts,counts[0])
    back= count_duplicates(counts,counts[-1])
    if front < len(counts) and front >= len(counts)-2:
        return True
    if back < len(counts) and back >= len(counts)-2:
        return True
    return False

#TODO: may consider to tighten the criteria and reduce the amount of overlapping clone reports. 
def remove_staggered( cluster ):
    rs= map( vector_linerange, cluster )
    keep= map( lambda x: True, cluster )
    for i in range(len(rs)-1):
        for j in range(i+1,len(rs)):
            if rs[j][2] == rs[i][2] and rs[j][0] < rs[i][1] \
                    and rs[j][0] > rs[i][0] and rs[j][1] > rs[i][1] \
                    and (rs[i][1]-rs[j][0])*2 >= rs[i][1]-rs[i][0] \
                    and (rs[j][1]-rs[i][1])*2 <= rs[i][1]-rs[i][0]:
                keep[j]= False
    newlist= []
    for i in range(len(cluster)):
        if keep[i]:
            newlist.append(cluster[i])
    return newlist

def remove_overlaping( cluster ):
    rs= map( vector_linerange, cluster )
    keep= map( lambda x: True, cluster )
    for i in range(len(rs)-1):
        for j in range(i+1,len(rs)):
            if  rs[j][2] == rs[i][2] and rs[j][0] < rs[i][1] \
                    and rs[j][0] > rs[i][0] and rs[j][1] > rs[i][1]: \
                keep[j]= False
    newlist= []
    for i in range(len(cluster)):
        if keep[i]:
            newlist.append(cluster[i])
    return newlist

def cluster_lineranges( cluster ):
    size= len(cluster)
    rsl=([], [], size)
    for i in range(size):
        m= range_r.match(cluster[i])
        if m != None:
            rsl[0].append( (int(m.group(2)), int(m.group(2))+int(m.group(3))-1) )
            rsl[1].append( m.group(1) )
    return rsl

def remove_contained_across_clusters ( clusters ):
#should have no effect on cd_coverage: compare each range and filename (luckily sorted by filenames)
#take quadratic time (from seconds to minutes; range trees will perform better), but it can help to reduce re-parsing time (about 1/3, several minutes)
    rs= map( cluster_lineranges, clusters )
    keep= map( lambda x: True, clusters )
    for i in range(len(rs)):
        for j in range(len(rs)):
            if not keep[i] or not keep[j] or i == j:
                continue
            if rs[j][2] == rs[i][2] and len(rs[j][1]) == len(rs[i][1]):
                filematch= True
                for fn in range(len(rs[i][1])):
                    if rs[i][0][fn][0] >= rs[j][0][fn][0] and rs[i][0][fn][1] <= rs[j][0][fn][1] \
                            and rs[j][1][fn] == rs[i][1][fn]:
                        continue
                    else:
                        filematch= False
                        break
                if filematch:
                    keep[i]= False
    newlist= []
    for i in range(len(clusters)):
        if keep[i]:
            newlist.append(clusters[i])
    return newlist

def remove_contained( cluster ):
    rs= map( vector_linerange, cluster )
    keep= map( lambda x: True, cluster )
    for i in range(len(rs)):
        for j in range(len(rs)):
            if not keep[i] or i == j or not keep[j]:
                continue
            if rs[j][2] == rs[i][2] \
                    and rs[i][0] >= rs[j][0] and rs[i][1] <= rs[j][1]:
                keep[i]= False
    newlist= []
    for i in range(len(cluster)):
        if keep[i]:
            newlist.append(cluster[i])
    return newlist

def not_empty( cluster ):
    if len(cluster) <= 1:
        return False
    return True

# compare locations of two clones:
def cmp_codelocation( c1, c2 ):
    r1 = vector_linerange(c1)
    r2 = vector_linerange(c2)
    if r1[2]==r2[2]:
       if r1[0]<r2[0]: return -1
       elif r1[0]>r2[0]: return 1
       else: return r1[1]-r2[1]
    elif r1[2]<r2[2]: return -1
    else: return 1

def cmp_linecount( c1, c2 ):
    c1l= map( vector_linecount, c1 )
    c2l= map( vector_linecount, c2 )
    m1= reduce( max, c1l )
    m2= reduce( max, c2l )
    if m1 > m2: return -1
    if m1 == m2: return 0
    return 1

def readclusters( file ):
#    print file
    f= open(file,'r')
    clusters= []
    cluster= []
    for line in f.readlines():
        if line == '\n' or line == '':
            if len(cluster) > 0:
                cluster.sort(cmp_codelocation)
                clusters.append(cluster)
                cluster=[]
            continue
        cluster.append(line)
    f.close()
    if len(cluster) > 0:
        cluster.sort(cmp_codelocation)
        clusters.append(cluster)
#    clusters= clusters[1:]
    return clusters


# read in all clone clusters from a file
clusters= readclusters( sys.argv[1] )

# remove clones completely contained in another clone in the same cluster
clusters= map( remove_contained, clusters )

# remove clones overlapped with another clone more than 50%
clusters= map( remove_staggered, clusters )

# remove clones overlapped with another clone more than 0%
# disabled: clusters= map( remove_overlaping, clusters )

# remove clones contained in another clone in another cluster
# disabled for performance: clusters= remove_contained_across_clusters(clusters)

# some of the filtering may only be useful for bug detection purpose, which should and has been separated into another module
#clusters= filter( has_enough_lines , clusters )
#clusters= filter( line_diff_big, clusters )
#clusters= filter( only_one_diff, clusters )

# remove clusters containing less than 2 clones
clusters= filter( not_empty, clusters )
# sort clusters by the cloned lines of code
clusters.sort( cmp_linecount )

# print the result clusters
for cluster in clusters:
    for line in cluster:
        print line,
    print


